home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.cs.arizona.edu
/
ftp.cs.arizona.edu.tar
/
ftp.cs.arizona.edu
/
icon
/
newsgrp
/
group96b.txt
/
000042_icon-group-sender _Fri Oct 25 11:00:09 1996.msg
< prev
next >
Wrap
Internet Message Format
|
1997-01-02
|
9KB
Received: by cheltenham.cs.arizona.edu; Fri, 25 Oct 1996 12:58:04 MST
Date: Fri, 25 Oct 1996 11:00:09 -0700
From: Gregg Townsend <gmt>
Message-Id: <9610251800.AA12975@hawk.CS.Arizona.EDU>
To: nr@viper.cs.Virginia.EDU
Subject: Re: Icon code for working with HTML
Cc: icon-group
Errors-To: icon-group-errors@cs.arizona.edu
Here's some code for parsing HTML files. It isn't exactly what
you're looking for, but it may help.
Gregg Townsend / gmt@CS.Arizona.EDU / +1 520 621 4325 / 32 13 45N 110 57 16W
Computer Science / Univ of Arizona / 1040 E 4th St / Tucson AZ 85721-0077
############################################################################
#
# File: html.icn
#
# Subject: Procedures for parsing HTML
#
# Author: Gregg M. Townsend
#
# Date: August 12, 1996
#
############################################################################
#
# These procedures parse HTML files:
#
# htchunks(f) generates the basic chunks -- tags and text --
# that compose an HTML file.
#
# htrefs(f) generates the tagname/keyword/value combinations
# that reference other files.
#
# These procedures process strings from HTML files:
#
# httag(s) extracts the name of a tag.
#
# htvals(s) generates the keyword/value pairs from a tag.
#
# urlmerge(base,new) interprets a new URL in the context of a base.
#
############################################################################
#
# htchunks(f) generates the HTML chunks from file f.
# It returns strings beginning with
#
# <!-- for unclosed comments (legal comments are deleted)
# < for tags (will end with ">" unless unclosed at EOF)
# anything else for text
#
# At this level entities such as &amp; are left unprocessed and all
# whitespace is preserved, including newlines.
#
############################################################################
#
# htrefs(f) extracts file/url references from within an HTML file
# and generates a string of the form
# tagname keyword value
# for each reference.
#
# A single space character separates the three fields, but if no
# value is supplied for the keyword, no space follows the keyword.
# Tag and keyword names are always returned in upper case.
#
# Quotation marks are stripped from the value, but note that the
# value can contain spaces or other special characters (although
# by strict HTML rules it probably shouldn't).
#
# A table in the code determines which fields are references to
# other files. For example, with <IMG>, SRC= is a reference but
# WIDTH= is not. The table will probably never be perfect
# considering the mutation rate of HTML in actual practice.
#
############################################################################
#
# httag(s) extracts and returns the tag name from within an HTML
# tag string of the form "<tagname...>". The tag name is returned
# in upper case.
#
############################################################################
#
# htvals(s) generates the tag values contained within an HTML tag
# string of the form "<tagname kw=val kw=val ...>". For each
# keyword=value pair beyond the tagname, a string of the form
#
# keyword value
#
# is generated. One space follows the keyword, which is returned
# in upper case, and quotation marks are stripped from the value.
# The value itself can be an empty string.
#
# For each keyword given without a value, the keyword is generated
# in upper case with no following space.
#
# Parsing is somewhat tolerant of errors.
#
############################################################################
#
# urlmerge(base,new) interprets a full or partial new URL in the
# context of a base URL, returning the combined URL.
#
# Here are some examples of applying urlmerge() with a base value
# of "http://www.vcu.edu/misc/sched.html" and a new value as given:
#
# new result
# ------------- -------------------
# #tuesday http://www.vcu.edu/misc/sched.html#tuesday
# bulletin.html http://www.vcu.edu/misc/bulletin.html
# ./results.html http://www.vcu.edu/misc/results.html
# images/rs.gif http://www.vcu.edu/misc/images/rs.gif
# ../ http://www.vcu.edu/
# /greet.html http://www.vcu.edu/greet.html
# file:a.html file:a.html
#
# Path components of "./" and "../" at the beginning of the
# new URL are handled specially to produce a simpler result.
# No other simplifications are applied.
#
############################################################################
# htchunks(f) -- generate HTML chunks from file f
#
#  Generates, in file order, the chunks read from f:
#     "<!--"         for a comment that is never closed (closed comments
#                    produce no chunk at all)
#     "<" ...        for a tag, as returned by htc_tag()
#     anything else  for a run of text, as returned by htc_text()
#
#  Every line read has "\n" appended before it is scanned.  The procedure
#  fails when the current line is exhausted and read(f) fails.

procedure htchunks(f)			#: generate chunks of HTML file

   "" ? repeat {
      if pos(0) then				# current line fully consumed
         &subject := (read(f) || "\n") | fail	# next line, or done
      if ="<!--" then
         suspend htc_comment(f)		# fails if comment is legal
      else if ="<" then
         suspend htc_tag(f)		# generate tag
      else
         suspend htc_text(f)		# generate text chunk
      }
end
# htc_tag(f) -- collect the rest of a tag whose "<" was already matched
#
#  Scans the caller's current subject, reading further lines from f (with
#  "\n" appended) as needed.  Returns the tag text including the leading
#  "<"; the result ends with ">" unless read(f) failed first.

procedure htc_tag(f)
   local tag

   tag := "<"
   until tag ||:= tab(upto('>') + 1) do {	# loop ends once ">" is consumed
      tag ||:= tab(0)				# no ">" here: keep rest of line
      &subject := (read(f) || "\n") | return tag	# unclosed tag
      }
   return tag					# completed tag
end
# htc_comment(f) -- skip a comment whose "<!--" was already matched
#
#  Scans the caller's current subject, reading further lines from f (with
#  "\n" appended) as needed.
#
#  If the terminating "-->" is found, the comment is consumed and the
#  procedure fails, so a legal comment yields no chunk.  If read(f) fails
#  first, all text collected after the "<!--" becomes the new subject, to
#  be rescanned by the caller, and "<!--" is returned as an error marker.
#
#  The terminator is given to find() as a string.  A cset literal '-->'
#  would be converted to the two-character string "->", which matches in
#  the wrong place and consumes one character too many.

procedure htc_comment(f)
   local s

   s := ""
   repeat {
      if tab(find("-->") + 3) then
         fail				# normal case: discard comment
      s ||:= tab(0)
      &subject := (read(f) || "\n") | break
      }
   &subject := s			# rescan unclosed comment
   return "<!--"			# return error indicator
end
# htc_text(f) -- collect a run of text up to the next "<"
#
#  Scans the caller's current subject, reading further lines from f (with
#  "\n" appended) as needed.  Returns the text preceding the next "<",
#  or everything collected so far if read(f) fails first.

procedure htc_text(f)
   local text

   text := ""
   until text ||:= tab(upto('<')) do {	# loop ends once a "<" is reached
      text ||:= tab(0)			# no "<" here: keep rest of line
      &subject := (read(f) || "\n") | return text
      }
   return text
end
## htrefs(f) -- generate references from HTML file f
#
#  For every tag produced by htchunks(f) whose name appears in the table
#  below, generates one string per listed keyword found in that tag:
#
#	tagname keyword value
#
#  A keyword that appears without a value is generated as
#  "tagname keyword", with nothing after the keyword.
#
#  The table maps an upper-case tag name to the set of upper-case
#  keywords that are treated as references to other files.

procedure htrefs(f)			#: generate references from HTML file
   local tag, tagname, kwset, s
   static ttable

   initial {
      ttable := table()
      ttable["A"]     := set(["HREF"])
      ttable["AREA"]  := set(["HREF"])	# image-map region (was misspelled "ARE")
      ttable["BASE"]  := set(["HREF"])
      ttable["BODY"]  := set(["BACKGROUND"])
      ttable["FORM"]  := set(["ACTION"])
      ttable["IMG"]   := set(["SRC", "LOSRC", "USEMAP"])
      ttable["INPUT"] := set(["SRC"])
      ttable["LINK"]  := set(["HREF"])
      }

   every tag := htchunks(f) do {
      tagname := httag(tag) | next		# skip text and unnamed chunks
      kwset := \ttable[tagname] | next		# skip tags not in the table
      every s := htvals(tag) do
         # The keyword is everything before the first space, or all of s
         # when no value was given (htvals adds no space in that case).
         if member(kwset, s ? tab(upto(' ') | 0)) then
            suspend tagname || " " || s
      }
end
## httag(s) -- return the name of the HTML tag s
#
#  s must begin with "<".  Whitespace after the "<" is skipped, and the
#  following run of letters, digits, "." and "-" is returned in upper
#  case.  Fails if s does not begin with "<" or if no such run follows.

procedure httag(s)			#: extract name of HTML tag
   static namechars, blanks

   initial {
      namechars := &letters ++ &digits ++ '.-'
      blanks := ' \t\r\n\v\f'
      }

   s ? {
      if ="<" then {
         tab(many(blanks))		# optional whitespace before the name
         return map(tab(many(namechars)), &lcase, &ucase)
         }
      fail				# not a tag
      }
end
## htvals(s) -- generate tag values of HTML tag s
#
#  s has the form "<tagname kw=val kw=val ...>".  For each keyword after
#  the tag name this generates
#	"KEYWORD value"   when "=" follows the keyword (value may be empty)
#	"KEYWORD"         when no "=" follows
#  Keywords are returned in upper case.  A value that opens with a double
#  or single quote runs to the matching quote (or to the end of s if
#  there is none) and the quotes are dropped; any other value runs to
#  the next whitespace or ">".
#  Fails immediately if s does not start with "<" followed by a tag name.

procedure htvals(s)			#: generate values in HTML tag
   local pair, quote
   static namechars, blanks, stopchars

   initial {
      namechars := &letters ++ &digits ++ '.-'
      blanks := ' \t\r\n\v\f'
      stopchars := blanks ++ '>'
      }

   s ? {
      ="<" | fail
      tab(many(blanks))
      tab(many(namechars)) | fail		# no name

      while tab(upto(namechars)) do {		# advance to next keyword
         pair := map(tab(many(namechars)), &lcase, &ucase)
         tab(many(blanks))
         if ="=" then {
            tab(many(blanks))
            pair ||:= " "
            if quote := tab(any('"\'')) then {	# quoted value
               pair ||:= tab(upto(quote) | 0)
               tab(any(quote))			# step over closing quote
               }
            else				# bare value
               pair ||:= tab(upto(stopchars) | 0)
            }
         suspend pair
         }
      }
end
# urlmerge(base,new) -- merge URLs
#
#  Returns new interpreted relative to base:
#    - new starting with letters and ":" is returned unchanged;
#    - new starting with "#" is appended to base minus any "#..." part;
#    - new starting with "/" replaces everything after base's host;
#    - otherwise leading "./" and "../" components of new are consumed
#      (each "../" trims base's path via url_trim() while that succeeds)
#      and the rest of new is appended to the directory part of the path.

procedure urlmerge(base, new)		#: merge URLs
   local scheme, authority, doc
   static nonslash

   initial nonslash := ~'/'

   new ? {
      if tab(many(&letters)) & =":" then
         return new			# new is fully specified
      }

   # Split base into "scheme:", "//host", and the path up to any "#".
   base ? {
      scheme := (tab(many(&letters)) || =":") | ""
      authority := (="//" || tab(upto('/') | 0)) | ""
      doc := tab(upto('#') | 0)
      }

   new ? {
      if any('#') then			# fragment within the base document
         return scheme || authority || doc || new
      if any('/') then			# absolute path on the same host
         return scheme || authority || new

      repeat {
         if ="." & (="/" | pos(0)) then
            next			# "./" adds nothing
         if =".." & (="/" | pos(0)) & (doc := url_trim(doc)) then
            next			# "../" removed one level
         break				# nothing more to simplify
         }

      # Drop the trailing file name from the path and append what is left.
      return scheme || authority || trim(doc, nonslash) || tab(0)
      }
end
# url_trim(path) -- trim trailing dir provided that at least one "/" remains
#
#  Removes the last "/" of path together with everything after it and
#  returns what is left.  Fails, leaving nothing changed for the caller,
#  when
#    - path contains no "/",
#    - the component just before that last "/" is "..", or
#    - the trimmed result would contain no "/".

procedure url_trim(path)

   reverse(path) ? {			# work from back end
      tab(upto('/') + 1) | fail		# trim dir, fail if no "/"
      if =".." & (="/" | pos(0))
         then fail			# don't trim a ".." component
      path := reverse(tab(0))		# otherwise use the rest
      }
   if upto('/', path) then		# one / must remain to be valid
      return path
   else
      fail
end